/*
Input: LTW2018_Straßenverzeichnis_einzeln_mit Filter.xlsx
		> raw excel file comprising full electoral roll of 2018 state election (SE-18 / LTW18)
		including voters' home addresses

Output: $tmp/strassen_prep_ltw18_single.dta

Main task:
	> the comprehensive list of addresses will serve as reference for geocoding
	> clean street names for geocoding and for merge with electoral rolls from other elections
	> addresses are identified by merge_strasse (harmonized street name) and nummer (address number)
	
*/


* READ: Excel file containing home addresses, SE-18 (156,261)
	import excel "$rawdata/election_office/electoral_rolls/LTW2018_Straßenverzeichnis_einzeln_mit Filter.xlsx", firstrow clear
	
*rename
	ren		STRASSE strasse
	ren		HNr hnr
	ren		BUCHSTABE buchstabe
	ren		PLZ plz
	ren		STIMMBEZIRK sb_
	ren		STIMMKREIS wahlkreis_ltw18
	ren		STADTBEZIRK stadtbez
	lab var stadtbez "city district"
	
* generate number of the address containg also letters (e.g., 6 + a -> 6a)
	tostring hnr, replace
	gen 	nummer = hnr + buchstabe
	lab var nummer "address number"

* keep only necessary variables: street, number, district
	drop hnr buchstabe plz sb_ wahlkreis_ltw18
	
* Clean street names for geocoding 
	//harmonizing apreviations and comon words with different possible spellings
	replace strasse = subinstr(strasse,"St.","Sankt",.)
	replace strasse = subinstr(strasse,"Südl.","Südliche",.)
	replace strasse = subinstr(strasse,"Alle","Allee",.)
	replace strasse = subinstr(strasse,"Alleee","Allee",.)
	replace strasse = subinstr(strasse,"pl.","platz",.)
	replace strasse = subinstr(strasse,"Pl.","Platz",.)
	replace strasse = subinstr(strasse,"str.","straße",.)
	replace strasse = subinstr(strasse,"Str.","Straße",.)
	replace strasse = subinstr(strasse,"straße","strasse",.)
	replace strasse = subinstr(strasse,"Straße","Strasse",.)
	
	//correct typos
	replace strasse="Allensteiner Strasse" if strasse=="Alleensteiner Strasse"
	replace strasse="Allescherstrasse" if strasse=="Alleescherstrasse"
	
* Harmonize street names 
*	gen merge_strasse := harmonized street names
	gen 	merge_strasse = strasse
	lab var strasse 	  "cleaned street name for geocoder"
	lab var merge_strasse "harmonized street name for merging"
	
	replace merge_strasse = subinstr(merge_strasse,"Dr.","Doktor",.)
	replace merge_strasse = subinstr(merge_strasse,"Prof.","Professor",.)
	replace merge_strasse = subinstr(merge_strasse," ","",.)
	replace merge_strasse = lower(merge_strasse)
	replace merge_strasse = subinstr(merge_strasse,"-","",.)
	replace merge_strasse = subinstr(merge_strasse,".","",.)
	replace merge_strasse = subinstr(merge_strasse,"'","",.)
	replace merge_strasse = subinstr(merge_strasse,"ß","ss",.)
	replace merge_strasse = subinstr(merge_strasse,"ä","ae",.)
	replace merge_strasse = subinstr(merge_strasse,"ö","oe",.)
	replace merge_strasse = subinstr(merge_strasse,"ü","ue",.)


* generate city variable for geocoding
	gen 	stadt = "München"
	lab var stadt "city name for geocoder"

* save
	isid merge_strasse nummer	
	save "$tmp/strassen_prep_ltw18_single.dta", replace
